In this notebook we compare all the approaches we developed and test them on a small set of professions for which we have precise statistics. We will use both FastText's and Gonen's embeddings.
# general
import pandas as pd
import numpy as np
import scipy.stats
import codecs
from matplotlib import pyplot as plt
from numpy import linalg as LA
# bolukbasi's
import debiaswe as dwe
import debiaswe.we as we
from debiaswe.we import WordEmbedding
# mine
import db_functions as db
We load all the embeddings using the same methods and applying the same preprocessing, so that every approach starts from a common baseline.
# Load Gonen's embeddings.
# NOTE(review): the second argument ('g' / 'ft') is presumably a tag telling the
# local WordEmbedding variant which source/format it is reading — confirm in debiaswe/we.py.
E_g = WordEmbedding('../gonen/embeddings/it_lemma_to_fem', 'g')
# Load FastText's embeddings (cc.it.300.vec); the gender directions in this
# section are all computed on E_ft.
E_ft = WordEmbedding('../bolukbasi/mio/embeddings/cc.it.300.vec', 'ft')
# Gender direction g_diff: derived from the single word pair 'lui' / 'lei'.
# NOTE(review): presumably the (normalised) difference of the two word vectors —
# confirm against WordEmbedding.diff.
g_diff = E_ft.diff('lui', 'lei')
# Word pairs used to define gender, each written as [male, female].
# (An earlier variant listed the same pairs as [female, male]; it is no longer used.)
gender_pairs = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]

# PCA over the gender pairs, computed on the FastText embeddings.
pca = we.doPCA(gender_pairs, E_ft)
pc = pca.components_        # principal components
sv = pca.singular_values_   # singular values, one per component

# Show how much each component contributes.
print("Singular values:")
print(sv)
plt.bar(range(pca.n_components_), sv)
plt.title("Singular values")
plt.show()

# Gender direction g_pca_0: first principal component rescaled to unit norm.
g_pca_0 = pc[0]/LA.norm(pc[0])

# Gender direction g_pca_01: the first two components averaged with their
# singular values as weights, then rescaled to unit norm.
_pc01_weighted = (sv[0]*pc[0]+sv[1]*pc[1])/(sv[0]+sv[1])
g_pca_01 = _pc01_weighted/LA.norm(_pc01_weighted)
# Semantic gender pairs, each written as [male, female].
# (An earlier variant listed the same pairs as [female, male]; it is no longer used.)
S_word = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]
# Grammatical gender pairs read from file.
# NOTE(review): the 'mf' mode presumably yields [male, female] pairs — confirm in db.read.
G_word = db.read('gram_def_mf.txt', 'mf')

# Index 0 of every pair is the masculine word and index 1 the feminine one.
# The semantic lists previously read the indices the other way round (a
# leftover from the old [female, male] ordering), so S_m held the feminine
# vectors and S_f the masculine ones. Products of the form
# (S_m - S_f)(S_m - S_f)^T are unaffected by the swap, but the names now
# match their content and agree with the G_* lists.
S_m_v = [E_ft.v(pair[0]) for pair in S_word]
S_f_v = [E_ft.v(pair[1]) for pair in S_word]
G_m_v = [E_ft.v(pair[0]) for pair in G_word]
G_f_v = [E_ft.v(pair[1]) for pair in G_word]

# Stack the vectors and transpose, so each word vector becomes one column
# (assuming E_ft.v returns a 1-D vector — TODO confirm).
S_m = np.array(S_m_v).T
S_f = np.array(S_f_v).T
G_m = np.array(G_m_v).T
G_f = np.array(G_f_v).T
# Gender direction g_e, obtained from an eigen-decomposition that contrasts the
# semantic pair differences with the grammatical pair differences.
# S_m/S_f/G_m/G_f are the transposed stacks of word vectors built just before
# this step, so each product below is (difference matrix) x (its transpose).

# delta semantic: product of the semantic male-female difference matrix with its transpose
Delta_S = np.dot(np.subtract(S_m, S_f), np.subtract(S_m, S_f).T)
# delta grammatical: same construction for the grammatical pairs
Delta_G = np.dot(np.subtract(G_m, G_f), np.subtract(G_m, G_f).T)
# A = inverse(Delta_G) * Delta_S
# NOTE(review): np.linalg.inv raises LinAlgError if Delta_G is exactly singular and is
# numerically fragile if it is nearly so; Delta_G's rank cannot exceed the number of
# grammatical pairs — verify there are enough pairs for the embedding dimension.
A = np.dot(np.linalg.inv(Delta_G), Delta_S)
# eigenvalues (w) and eigenvectors (columns of v) of A
# NOTE(review): A is not symmetric in general, so np.linalg.eig may return complex
# values; downstream code takes np.real(...) of the projections on g_e.
w, v = np.linalg.eig(A)
# gender direction: pick the eigenvector associated to the largest eigenvalue
g_e = v[:,np.argmax(w)] # already normalized (np.linalg.eig returns unit-length eigenvectors)
# Professions with ground-truth statistics, read through the local helper.
prof_ung_truth = db.read('professions_ung.csv', 'truth')

# Project every profession on each of the four gender directions, in the
# order g_diff, g_pca_0, g_pca_01, g_e.
(proj_truth_g_diff,
 proj_truth_g_pca_0,
 proj_truth_g_pca_01,
 proj_truth_g_e) = [
    db.prof_proj(E_ft, prof_ung_truth, _direction, 'istat')
    for _direction in (g_diff, g_pca_0, g_pca_01, g_e)
]
# Build the summary table column by column. Every column is indexed over the
# rows of proj_truth_g_diff, exactly like the row-wise loop it replaces.
_rows = range(len(proj_truth_g_diff))
truth_col0 = [proj_truth_g_diff[k][0] for k in _rows]             # profession
truth_col1 = [proj_truth_g_diff[k][1] for k in _rows]             # projection on g_diff
truth_col2 = [proj_truth_g_pca_0[k][1] for k in _rows]            # projection on g_pca_0
truth_col3 = [proj_truth_g_pca_01[k][1] for k in _rows]           # projection on g_pca_01
truth_col4 = [np.real(proj_truth_g_e[k][1]) for k in _rows]       # projection on g_e (real part)
truth_col5 = [proj_truth_g_diff[k][2] for k in _rows]             # %male
truth_col6 = [proj_truth_g_diff[k][3] for k in _rows]             # %female

# Column headers are kept exactly as written, since they end up in the CSV.
truth_data = {
    'Profession': truth_col0,
    'Proj g_diff': truth_col1,
    'Proj g_pca_0': truth_col2,
    'Proj g_pca_01': truth_col3,
    'Prog g_e': truth_col4,
    '% male': truth_col5,
    '% female': truth_col6,
}
truth_table = pd.DataFrame(truth_data)
truth_table
truth_table.to_csv('truth_ung_ft.csv', index=False)
# Pearson correlation between % female and the projection on each direction
# (only the coefficient is kept, the p-value is discarded).
r_g_diff, r_g_pca_0, r_g_pca_01, r_g_e = [
    scipy.stats.pearsonr(truth_col6, _projection)[0]
    for _projection in (truth_col1, truth_col2, truth_col3, truth_col4)
]

# One-row table with a column per gender direction.
pearson_data = dict(
    g_diff=r_g_diff,
    g_pca_0=r_g_pca_0,
    g_pca_01=r_g_pca_01,
    g_e=r_g_e,
)
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
# Correlation plots: for each gender direction, plot projection against
# % female with the fitted regression line, label every point with its
# profession, show the figure and save it under plot/ung-prof/correlation/.
_correlation_specs = (
    # (direction name, projections, marker colour)
    ('g_diff', truth_col1, 'blue'),
    ('g_pca_0', truth_col2, 'red'),
    ('g_pca_01', truth_col3, 'green'),
    ('g_e', truth_col4, 'magenta'),
)
for _name, _proj, _colour in _correlation_specs:
    # Least-squares fit of projection against % female.
    slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, _proj)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    # Data points only (no connecting line), then the fitted line.
    ax.plot(truth_col6, _proj, linewidth=0, marker='s', color=_colour, label='Data points')
    ax.plot(np.array(truth_col6), intercept + slope * np.array(truth_col6), label=line)
    ax.set_xlabel('% female')
    ax.set_ylabel('proj value')
    ax.set_title(_name)
    ax.legend(facecolor='white')

    # Profession name centred 10 points above each data point.
    for i, label in enumerate(truth_col0):
        ax.annotate(label,
                    (truth_col6[i], _proj[i]),
                    textcoords="offset points",
                    xytext=(0,10),
                    ha='center')

    plt.show()
    fig.savefig(f'plot/ung-prof/correlation/ft_{_name}')
# 2x2 overview: one scatter panel per gender direction (projection vs. % female),
# every point labelled with its profession.
#
# The figure handle is captured explicitly: savefig() at the end must write
# THIS grid. Without the assignment, `fig` still refers to the figure created
# by the previous plotting step and that one would be saved as ft_all instead.
fig = plt.figure(figsize=(18,18))

_panels = (
    # (panel title, projections, marker colour)
    ('g_diff', truth_col1, 'blue'),
    ('g_pca_0', truth_col2, 'red'),
    ('g_pca_01', truth_col3, 'green'),
    ('g_e', truth_col4, 'magenta'),
)
_grid_axes = []
for _slot, (_title, _proj, _colour) in enumerate(_panels, start=1):
    _ax = fig.add_subplot(2, 2, _slot)
    _ax.scatter(truth_col6, _proj, c=_colour)
    _ax.set_title(_title)
    _ax.set_xlabel('% female')
    _ax.set_ylabel('proj value')
    # Profession name centred 10 points above each data point.
    for i, label in enumerate(truth_col0):
        _ax.annotate(label,
                     (truth_col6[i], _proj[i]),
                     textcoords="offset points",
                     xytext=(0,10),
                     ha='center')
    _grid_axes.append(_ax)

# Keep the per-panel handles available under their original names.
ax1, ax2, ax3, ax4 = _grid_axes

plt.show()
fig.savefig('plot/ung-prof/scatter/ft_all')
# Pairwise comparison plots: for every pair of gender directions, overlay the
# two sets of projections against % female, label every point with its
# profession, show the figure and save it under plot/ung-prof/scatter/.
_series = (
    # (direction name, projections, marker colour)
    ('g_diff', truth_col1, 'blue'),
    ('g_pca_0', truth_col2, 'red'),
    ('g_pca_01', truth_col3, 'green'),
    ('g_e', truth_col4, 'magenta'),
)
# Each unordered pair once, in the order
# (g_diff, g_pca_0), (g_diff, g_pca_01), (g_diff, g_e),
# (g_pca_0, g_pca_01), (g_pca_0, g_e), (g_pca_01, g_e).
_series_pairs = [
    (_first, _second)
    for _pos, _first in enumerate(_series)
    for _second in _series[_pos + 1:]
]

for (_name_a, _proj_a, _colour_a), (_name_b, _proj_b, _colour_b) in _series_pairs:
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    ax.scatter(truth_col6, _proj_a, c=_colour_a, label=_name_a)
    ax.scatter(truth_col6, _proj_b, c=_colour_b, label=_name_b)
    ax.set_title(f'{_name_a} vs {_name_b}')
    ax.set_xlabel('% female')
    ax.set_ylabel('proj value')
    ax.legend()

    # Label each profession twice: once on the first series' point, once on
    # the second's, centred 10 points above the marker.
    for i, label in enumerate(truth_col0):
        for _proj in (_proj_a, _proj_b):
            ax.annotate(label,
                        (truth_col6[i], _proj[i]),
                        textcoords="offset points",
                        xytext=(0,10),
                        ha='center')

    plt.show()
    fig.savefig(f'plot/ung-prof/scatter/ft_{_name_a}_vs_{_name_b}')
import importlib

# Reload the local helper module so that edits made to it since the first
# import are picked up without restarting the session.
importlib.reload(db)

# Professions that come in a male and a female form, with ground-truth statistics.
prof_gen_truth = db.read('professions_gen.csv', 'truth-mf')

# Project every profession on each of the four gender directions, in the
# order g_diff, g_pca_0, g_pca_01, g_e.
(mf_proj_truth_g_diff,
 mf_proj_truth_g_pca_0,
 mf_proj_truth_g_pca_01,
 mf_proj_truth_g_e) = [
    db.prof_proj(E_ft, prof_gen_truth, _direction, 'istat-mf')
    for _direction in (g_diff, g_pca_0, g_pca_01, g_e)
]
# Build the male/female summary table column by column. Every column is
# indexed over the rows of mf_proj_truth_g_diff, exactly like the row-wise
# loop it replaces. In each row, items 0/1 are the male word and its
# projection, items 2/3 the female word and its projection, item 4 is % female.
_rows = range(len(mf_proj_truth_g_diff))
mf_truth_col0 = [mf_proj_truth_g_diff[k][0] for k in _rows]            # male profession
mf_truth_col1 = [mf_proj_truth_g_diff[k][1] for k in _rows]            # male projection on g_diff
mf_truth_col2 = [mf_proj_truth_g_pca_0[k][1] for k in _rows]           # male projection on g_pca_0
mf_truth_col3 = [mf_proj_truth_g_pca_01[k][1] for k in _rows]          # male projection on g_pca_01
mf_truth_col4 = [np.real(mf_proj_truth_g_e[k][1]) for k in _rows]      # male projection on g_e (real part)
mf_truth_col5 = [mf_proj_truth_g_diff[k][2] for k in _rows]            # female profession
mf_truth_col6 = [mf_proj_truth_g_diff[k][3] for k in _rows]            # female projection on g_diff
mf_truth_col7 = [mf_proj_truth_g_pca_0[k][3] for k in _rows]           # female projection on g_pca_0
mf_truth_col8 = [mf_proj_truth_g_pca_01[k][3] for k in _rows]          # female projection on g_pca_01
mf_truth_col9 = [np.real(mf_proj_truth_g_e[k][3]) for k in _rows]      # female projection on g_e (real part)
mf_truth_col10 = [mf_proj_truth_g_diff[k][4] for k in _rows]           # %female

# Column headers are kept exactly as written, since they end up in the CSV.
mf_truth_data = {
    'Male profession': mf_truth_col0,
    'M-Proj g_diff': mf_truth_col1,
    'M-Proj g_pca_0': mf_truth_col2,
    'M-Proj g_pca_01': mf_truth_col3,
    'M-Prog g_e': mf_truth_col4,
    'Female profession': mf_truth_col5,
    'F-Proj g_diff': mf_truth_col6,
    'F-Proj g_pca_0': mf_truth_col7,
    'F-Proj g_pca_01': mf_truth_col8,
    'F-Prog g_e': mf_truth_col9,
    '% female': mf_truth_col10,
}
mf_truth_table = pd.DataFrame(mf_truth_data)
mf_truth_table
mf_truth_table.to_csv('truth_gen_ft.csv', index=False)
# Mean of the male and female projections of each profession, per direction.
mf_mean_g_diff, mf_mean_g_pca_0, mf_mean_g_pca_01, mf_mean_g_e = [
    (np.array(_male) + np.array(_female)) / 2
    for _male, _female in (
        (mf_truth_col1, mf_truth_col6),
        (mf_truth_col2, mf_truth_col7),
        (mf_truth_col3, mf_truth_col8),
        (mf_truth_col4, mf_truth_col9),
    )
]

# Pearson correlation between % female and each mean projection
# (only the coefficient is kept, the p-value is discarded).
r_mean_g_diff, r_mean_g_pca_0, r_mean_g_pca_01, r_mean_g_e = [
    scipy.stats.pearsonr(mf_truth_col10, _mean)[0]
    for _mean in (mf_mean_g_diff, mf_mean_g_pca_0, mf_mean_g_pca_01, mf_mean_g_e)
]

# One-row table with a column per gender direction.
pearson_data = dict(
    g_diff=r_mean_g_diff,
    g_pca_0=r_mean_g_pca_0,
    g_pca_01=r_mean_g_pca_01,
    g_e=r_mean_g_e,
)
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
# Gendered-profession plots. For each gender direction two figures are drawn,
# shown and saved under plot/gen-prof/scatter/:
#   1. ft_<name>:      ungendered professions with their regression line, plus
#                      the male and female projections of the gendered ones;
#   2. ft_<name>_mean: the same, plus the mean male/female projection and the
#                      regression line fitted on that mean.
_gen_specs = (
    # (name, colour, ung proj, male proj, female proj, mean proj, ung labels first?)
    ('g_diff', 'blue', truth_col1, mf_truth_col1, mf_truth_col6, mf_mean_g_diff, True),
    ('g_pca_0', 'red', truth_col2, mf_truth_col2, mf_truth_col7, mf_mean_g_pca_0, False),
    ('g_pca_01', 'green', truth_col3, mf_truth_col3, mf_truth_col8, mf_mean_g_pca_01, False),
    ('g_e', 'magenta', truth_col4, mf_truth_col4, mf_truth_col9, mf_mean_g_e, False),
)

for _name, _colour, _ung, _male, _female, _mean, _ung_first in _gen_specs:
    for _with_mean in (False, True):
        # Regression on the ungendered professions (drawn in both figures).
        slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, _ung)
        if _with_mean:
            line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
            # Regression on the mean male/female projection of the gendered professions.
            slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(mf_truth_col10, _mean)
            line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'
        else:
            line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

        fig = plt.figure(figsize=(10,10))
        ax = fig.add_subplot(111)
        # Ungendered data points (no connecting line) and their fitted line.
        ax.plot(truth_col6, _ung, linewidth=0, marker='s', color=_colour, label='ung proj')
        ax.plot(np.array(truth_col6), intercept + slope * np.array(truth_col6), label=line)
        if _with_mean:
            ax.plot(np.array(mf_truth_col10), intercept_mean + slope_mean * np.array(mf_truth_col10), label=line_mean, color='black')
        ax.scatter(mf_truth_col10, _male, c='cyan', label='male proj')
        ax.scatter(mf_truth_col10, _female, c='pink', label='female proj')
        if _with_mean:
            ax.scatter(mf_truth_col10, _mean, c='brown', label='mean proj')
        ax.set_title(_name)
        ax.set_xlabel('% female')
        ax.set_ylabel('proj value')
        ax.legend()

        # Point labels as (labels, x values, y values). The g_diff figures add
        # the ungendered labels first, the other directions add them last;
        # that original ordering is kept.
        _ung_group = (truth_col0, truth_col6, _ung)
        _gendered_groups = [
            (mf_truth_col0, mf_truth_col10, _male),
            (mf_truth_col5, mf_truth_col10, _female),
        ]
        if _ung_first:
            _label_groups = [_ung_group] + _gendered_groups
        else:
            _label_groups = _gendered_groups + [_ung_group]
        for _labels, _xs, _ys in _label_groups:
            # Label centred 10 points above each data point.
            for i, label in enumerate(_labels):
                ax.annotate(label,
                            (_xs[i], _ys[i]),
                            textcoords="offset points",
                            xytext=(0,10),
                            ha='center')

        plt.show()
        if _with_mean:
            fig.savefig(f'plot/gen-prof/scatter/ft_{_name}_mean')
        else:
            fig.savefig(f'plot/gen-prof/scatter/ft_{_name}')
# Debiasing routine from the debiaswe package (imported here, at first use).
from debiaswe.debias import debias
# Run it on the FastText embeddings, passing G_word (the list read from
# 'gram_def_mf.txt' earlier in this file) as the third argument.
# NOTE(review): the return value is discarded, so this presumably modifies
# E_ft in place — everything below is saved under 'ft_deb' names. The meaning
# of the two empty-list arguments should be confirmed against
# debiaswe.debias.debias.
debias(E_ft, [], G_word, [])
# Gender directions recomputed on E_ft after the debias() call above.

# (1) plain difference between the two pronouns
g_diff = E_ft.diff('lui', 'lei')

# (2) PCA over definitional pairs, listed masculine-first.
# (A feminine-first ordering, e.g. ['lei','lui'], was the disabled alternative.)
gender_pairs = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]
pca = we.doPCA(gender_pairs, E_ft)
pc = pca.components_        # principal components
sv = pca.singular_values_   # matching singular values

print("Singular values:")
print(sv)
plt.bar(range(pca.n_components_), sv)
plt.title("Singular values")
plt.show()

# First principal component, rescaled to unit length.
g_pca_0 = pc[0] / LA.norm(pc[0])
# Singular-value-weighted average of the first two components, rescaled to unit length.
weighted_pc01 = (sv[0]*pc[0] + sv[1]*pc[1]) / (sv[0] + sv[1])
g_pca_01 = weighted_pc01 / LA.norm(weighted_pc01)
# Gender direction g_e: leading eigenvector of inv(Delta_G) @ Delta_S, where
# Delta_S / Delta_G are built from semantic / grammatical male-female
# difference vectors of the E_ft embeddings.

# Semantic definitional pairs, masculine word first.
S_word = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]
# Grammatical definitional pairs read from file.
# NOTE(review): indexed below as pair[0] = masculine, pair[1] = feminine,
# as the 'mf' mode suggests — confirm against db.read.
G_word = db.read('gram_def_mf.txt', 'mf')

# The pairs are masculine-first, so index 0 feeds the "m" list and index 1 the
# "f" list. (The previous code had these two swapped for S_word; Delta_S is an
# outer product of the difference with itself, so the sign flip does not
# change it — only the variable names were misleading.)
S_m_v = [E_ft.v(pair[0]) for pair in S_word]
S_f_v = [E_ft.v(pair[1]) for pair in S_word]
G_m_v = [E_ft.v(pair[0]) for pair in G_word]
G_f_v = [E_ft.v(pair[1]) for pair in G_word]

# One word vector per column.
S_m = np.array(S_m_v).T
S_f = np.array(S_f_v).T
G_m = np.array(G_m_v).T
G_f = np.array(G_f_v).T

# delta semantic
diff_S = np.subtract(S_m, S_f)
Delta_S = np.dot(diff_S, diff_S.T)
# delta grammatical
diff_G = np.subtract(G_m, G_f)
Delta_G = np.dot(diff_G, diff_G.T)

# A = Delta_G^-1 Delta_S
# NOTE(review): the rank of Delta_G is at most len(G_word); if that is smaller
# than the embedding dimension the explicit inverse is ill-conditioned or
# fails — confirm the grammatical list is large enough.
A = np.dot(np.linalg.inv(Delta_G), Delta_S)

# eigenvalues and eigenvectors; A is not symmetric in general, so both may be
# complex (the projection code below takes np.real of the results).
w, v = np.linalg.eig(A)
# gender direction: eigenvector associated with the largest eigenvalue
g_e = v[:, np.argmax(w)]  # eig returns unit-length eigenvectors
# Ungendered professions together with their ground-truth statistics.
prof_ung_truth = db.read('professions_ung.csv', 'truth')

# Projection of every profession on each of the four gender directions.
proj_truth_g_diff = db.prof_proj(E_ft, prof_ung_truth, g_diff, 'istat')
proj_truth_g_pca_0 = db.prof_proj(E_ft, prof_ung_truth, g_pca_0, 'istat')
proj_truth_g_pca_01 = db.prof_proj(E_ft, prof_ung_truth, g_pca_01, 'istat')
proj_truth_g_e = db.prof_proj(E_ft, prof_ung_truth, g_e, 'istat')

# Unpack the per-profession records into one list per table column.
truth_col0 = [rec[0] for rec in proj_truth_g_diff]          # profession
truth_col1 = [rec[1] for rec in proj_truth_g_diff]          # projection on g_diff
truth_col2 = [rec[1] for rec in proj_truth_g_pca_0]         # projection on g_pca_0
truth_col3 = [rec[1] for rec in proj_truth_g_pca_01]        # projection on g_pca_01
truth_col4 = [np.real(rec[1]) for rec in proj_truth_g_e]    # projection on g_e (real part)
truth_col5 = [rec[2] for rec in proj_truth_g_diff]          # %male
truth_col6 = [rec[3] for rec in proj_truth_g_diff]          # %female

truth_data = {
    'Profession': truth_col0,
    'Proj g_diff': truth_col1,
    'Proj g_pca_0': truth_col2,
    'Proj g_pca_01': truth_col3,
    'Prog g_e': truth_col4,
    '% male': truth_col5,
    '% female': truth_col6,
}
truth_table = pd.DataFrame(truth_data)
truth_table
truth_table.to_csv('truth_ung_ft_deb.csv', index=False)
# Pearson correlation between %female and the projection on each direction.
pearson_data = {
    direction: scipy.stats.pearsonr(truth_col6, projections)[0]
    for direction, projections in (
        ('g_diff', truth_col1),
        ('g_pca_0', truth_col2),
        ('g_pca_01', truth_col3),
        ('g_e', truth_col4),
    )
}
# Keep the individual coefficients available under their usual names.
r_g_diff = pearson_data['g_diff']
r_g_pca_0 = pearson_data['g_pca_0']
r_g_pca_01 = pearson_data['g_pca_01']
r_g_e = pearson_data['g_e']

pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
# Correlation plots for the ungendered professions: one figure per gender
# direction showing projection vs. %female, the fitted regression line and a
# label on every point. Each figure is saved under
# plot/ung-prof/correlation/ft_deb_<direction>.
# (Replaces four copy-pasted cells that differed only in the projection
# column, marker colour, title and output file.)
for direction, projections, color in (
    ('g_diff', truth_col1, 'blue'),
    ('g_pca_0', truth_col2, 'red'),
    ('g_pca_01', truth_col3, 'green'),
    ('g_e', truth_col4, 'magenta'),
):
    slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, projections)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    ax.plot(truth_col6, projections, linewidth=0, marker='s', color=color, label='Data points')
    ax.plot(np.array(truth_col6), intercept + slope * np.array(truth_col6), label=line)
    ax.set_xlabel('% female')
    ax.set_ylabel('proj value')
    ax.set_title(direction)
    ax.legend(facecolor='white')

    for label, x, y in zip(truth_col0, truth_col6, projections):
        ax.annotate(label,                       # text to draw
                    (x, y),                      # point being labelled
                    textcoords="offset points",  # xytext is an offset from the point
                    xytext=(0,10),               # 10 points above the marker
                    ha='center')                 # horizontally centred
    plt.show()
    fig.savefig(f'plot/ung-prof/correlation/ft_deb_{direction}')
# 2x2 grid of scatter plots (projection vs. %female), one panel per gender
# direction, saved as plot/ung-prof/scatter/ft_deb_all.
#
# Fix: the figure is now bound to `fig`. Previously plt.figure(...) was called
# without assignment, so the final fig.savefig(...) wrote whatever figure
# `fig` still referred to from the preceding cell instead of this grid.
fig = plt.figure(figsize=(18,18))
ax1 = fig.add_subplot(221)
ax2 = fig.add_subplot(222)
ax3 = fig.add_subplot(223)
ax4 = fig.add_subplot(224)

panels = (
    (ax1, 'g_diff', truth_col1, 'blue'),
    (ax2, 'g_pca_0', truth_col2, 'red'),
    (ax3, 'g_pca_01', truth_col3, 'green'),
    (ax4, 'g_e', truth_col4, 'magenta'),
)
for panel, direction, projections, color in panels:
    panel.scatter(truth_col6, projections, c=color)
    panel.set_title(direction)
    panel.set_xlabel('% female')
    panel.set_ylabel('proj value')
    for label, x, y in zip(truth_col0, truth_col6, projections):
        panel.annotate(label,                       # text to draw
                       (x, y),                      # point being labelled
                       textcoords="offset points",  # xytext is an offset from the point
                       xytext=(0,10),               # 10 points above the marker
                       ha='center')                 # horizontally centred
plt.show()
fig.savefig('plot/ung-prof/scatter/ft_deb_all')
# Pairwise comparison of the gender directions on the ungendered professions:
# for every pair of directions, overlay both sets of projections against
# %female and save the figure as
# plot/ung-prof/scatter/ft_deb_<first>_vs_<second>.
# (Replaces six copy-pasted cells; combinations() yields the pairs in the same
# order they were written out by hand.)
from itertools import combinations

directions = [
    ('g_diff', truth_col1, 'blue'),
    ('g_pca_0', truth_col2, 'red'),
    ('g_pca_01', truth_col3, 'green'),
    ('g_e', truth_col4, 'magenta'),
]
for (name_a, proj_a, color_a), (name_b, proj_b, color_b) in combinations(directions, 2):
    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    ax.scatter(truth_col6, proj_a, c=color_a, label=name_a)
    ax.scatter(truth_col6, proj_b, c=color_b, label=name_b)
    ax.set_title(f'{name_a} vs {name_b}')
    ax.set_xlabel('% female')
    ax.set_ylabel('proj value')
    ax.legend()

    # Each profession is labelled twice, once per direction.
    for i, label in enumerate(truth_col0):
        for projections in (proj_a, proj_b):
            ax.annotate(label,                            # text to draw
                        (truth_col6[i], projections[i]),  # point being labelled
                        textcoords="offset points",       # xytext is an offset from the point
                        xytext=(0,10),                    # 10 points above the marker
                        ha='center')                      # horizontally centred
    plt.show()
    fig.savefig(f'plot/ung-prof/scatter/ft_deb_{name_a}_vs_{name_b}')
# Gendered professions (male form / female form) with their ground-truth
# %female statistic.
prof_gen_truth = db.read('professions_gen.csv', 'truth-mf')

# Projection of every male/female form on each of the four gender directions.
mf_proj_truth_g_diff = db.prof_proj(E_ft, prof_gen_truth, g_diff, 'istat-mf')
mf_proj_truth_g_pca_0 = db.prof_proj(E_ft, prof_gen_truth, g_pca_0, 'istat-mf')
mf_proj_truth_g_pca_01 = db.prof_proj(E_ft, prof_gen_truth, g_pca_01, 'istat-mf')
mf_proj_truth_g_e = db.prof_proj(E_ft, prof_gen_truth, g_e, 'istat-mf')

# Unpack the per-profession records into one list per table column.
mf_truth_col0 = [rec[0] for rec in mf_proj_truth_g_diff]           # male profession
mf_truth_col1 = [rec[1] for rec in mf_proj_truth_g_diff]           # male projection on g_diff
mf_truth_col2 = [rec[1] for rec in mf_proj_truth_g_pca_0]          # male projection on g_pca_0
mf_truth_col3 = [rec[1] for rec in mf_proj_truth_g_pca_01]         # male projection on g_pca_01
mf_truth_col4 = [np.real(rec[1]) for rec in mf_proj_truth_g_e]     # male projection on g_e
mf_truth_col5 = [rec[2] for rec in mf_proj_truth_g_diff]           # female profession
mf_truth_col6 = [rec[3] for rec in mf_proj_truth_g_diff]           # female projection on g_diff
mf_truth_col7 = [rec[3] for rec in mf_proj_truth_g_pca_0]          # female projection on g_pca_0
mf_truth_col8 = [rec[3] for rec in mf_proj_truth_g_pca_01]         # female projection on g_pca_01
mf_truth_col9 = [np.real(rec[3]) for rec in mf_proj_truth_g_e]     # female projection on g_e
mf_truth_col10 = [rec[4] for rec in mf_proj_truth_g_diff]          # %female

# NOTE(review): 'M-Prog g_e' / 'F-Prog g_e' look like typos for 'Proj'; left
# unchanged because they become CSV column headers that other code may read.
mf_truth_data = {
    'Male profession': mf_truth_col0,
    'M-Proj g_diff': mf_truth_col1,
    'M-Proj g_pca_0': mf_truth_col2,
    'M-Proj g_pca_01': mf_truth_col3,
    'M-Prog g_e': mf_truth_col4,
    'Female profession': mf_truth_col5,
    'F-Proj g_diff': mf_truth_col6,
    'F-Proj g_pca_0': mf_truth_col7,
    'F-Proj g_pca_01': mf_truth_col8,
    'F-Prog g_e': mf_truth_col9,
    '% female': mf_truth_col10,
}
mf_truth_table = pd.DataFrame(mf_truth_data)
mf_truth_table
# Fix: save the gendered table. The previous code wrote `truth_table` (the
# ungendered professions) to this file.
mf_truth_table.to_csv('truth_gen_ft_deb.csv', index=False)
# Mean of the male-form and female-form projections, per gender direction.
mf_mean_g_diff = (np.array(mf_truth_col1) + np.array(mf_truth_col6)) / 2
mf_mean_g_pca_0 = (np.array(mf_truth_col2) + np.array(mf_truth_col7)) / 2
mf_mean_g_pca_01 = (np.array(mf_truth_col3) + np.array(mf_truth_col8)) / 2
mf_mean_g_e = (np.array(mf_truth_col4) + np.array(mf_truth_col9)) / 2

# Pearson correlation between %female and each mean projection.
pearson_data = {
    direction: scipy.stats.pearsonr(mf_truth_col10, mean_projection)[0]
    for direction, mean_projection in (
        ('g_diff', mf_mean_g_diff),
        ('g_pca_0', mf_mean_g_pca_0),
        ('g_pca_01', mf_mean_g_pca_01),
        ('g_e', mf_mean_g_e),
    )
}
# Keep the individual coefficients available under their usual names.
r_mean_g_diff = pearson_data['g_diff']
r_mean_g_pca_0 = pearson_data['g_pca_0']
r_mean_g_pca_01 = pearson_data['g_pca_01']
r_mean_g_e = pearson_data['g_e']

pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
# g_diff on gendered professions, drawn twice:
#   - plain:     ungendered points + their regression line, male and female
#                projections of the gendered professions;
#   - with mean: additionally the mean male/female projection and the
#                regression line fitted on it.
# (Replaces two copy-pasted cells; drawing and labelling order are unchanged.)
for with_mean in (False, True):
    slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, truth_col1)
    prefix = 'Ung regression line' if with_mean else 'Regression line'
    line = f'{prefix}: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
    if with_mean:
        slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(mf_truth_col10, mf_mean_g_diff)
        line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

    fig = plt.figure(figsize=(10,10))
    ax = fig.add_subplot(111)
    ax.plot(truth_col6, truth_col1, linewidth=0, marker='s', color='blue', label='ung proj')
    ax.plot(np.array(truth_col6), intercept + slope * np.array(truth_col6), label=line)
    if with_mean:
        ax.plot(np.array(mf_truth_col10), intercept_mean + slope_mean * np.array(mf_truth_col10), label=line_mean, color='black')
    ax.scatter(mf_truth_col10, mf_truth_col1, c='cyan', label='male proj')
    ax.scatter(mf_truth_col10, mf_truth_col6, c='pink', label='female proj')
    if with_mean:
        ax.scatter(mf_truth_col10, mf_mean_g_diff, c='brown', label='mean proj')
    ax.set_title('g_diff')
    ax.set_xlabel('% female')
    ax.set_ylabel('proj value')
    ax.legend()

    # Label every point: ungendered professions, then male forms, then female forms.
    for names, xs, ys in (
        (truth_col0, truth_col6, truth_col1),
        (mf_truth_col0, mf_truth_col10, mf_truth_col1),
        (mf_truth_col5, mf_truth_col10, mf_truth_col6),
    ):
        for label, x, y in zip(names, xs, ys):
            ax.annotate(label,                       # text to draw
                        (x, y),                      # point being labelled
                        textcoords="offset points",  # xytext is an offset from the point
                        xytext=(0,10),               # 10 points above the marker
                        ha='center')                 # horizontally centred
    plt.show()
    fig.savefig('plot/gen-prof/scatter/ft_deb_g_diff' + ('_mean' if with_mean else ''))
# g_pca_0, g_pca_01 and g_e on gendered professions. For each direction two
# figures are produced and saved under plot/gen-prof/scatter/:
#   - ft_deb_<direction>:      ungendered points + their regression line, male
#                              and female projections of the gendered professions;
#   - ft_deb_<direction>_mean: additionally the mean male/female projection and
#                              the regression line fitted on it.
# (Replaces six copy-pasted cells; drawing and labelling order are unchanged.)
gen_plot_specs = (
    # direction, ungendered proj, male proj, female proj, mean proj, marker colour
    ('g_pca_0', truth_col2, mf_truth_col2, mf_truth_col7, mf_mean_g_pca_0, 'red'),
    ('g_pca_01', truth_col3, mf_truth_col3, mf_truth_col8, mf_mean_g_pca_01, 'green'),
    ('g_e', truth_col4, mf_truth_col4, mf_truth_col9, mf_mean_g_e, 'magenta'),
)
for direction, ung_proj, male_proj, female_proj, mean_proj, color in gen_plot_specs:
    for with_mean in (False, True):
        slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, ung_proj)
        prefix = 'Ung regression line' if with_mean else 'Regression line'
        line = f'{prefix}: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
        if with_mean:
            slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(mf_truth_col10, mean_proj)
            line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'

        fig = plt.figure(figsize=(10,10))
        ax = fig.add_subplot(111)
        ax.plot(truth_col6, ung_proj, linewidth=0, marker='s', color=color, label='ung proj')
        ax.plot(np.array(truth_col6), intercept + slope * np.array(truth_col6), label=line)
        if with_mean:
            ax.plot(np.array(mf_truth_col10), intercept_mean + slope_mean * np.array(mf_truth_col10), label=line_mean, color='black')
        ax.scatter(mf_truth_col10, male_proj, c='cyan', label='male proj')
        ax.scatter(mf_truth_col10, female_proj, c='pink', label='female proj')
        if with_mean:
            ax.scatter(mf_truth_col10, mean_proj, c='brown', label='mean proj')
        ax.set_title(direction)
        ax.set_xlabel('% female')
        ax.set_ylabel('proj value')
        ax.legend()

        # Label every point: male forms, then female forms, then ungendered professions.
        for names, xs, ys in (
            (mf_truth_col0, mf_truth_col10, male_proj),
            (mf_truth_col5, mf_truth_col10, female_proj),
            (truth_col0, truth_col6, ung_proj),
        ):
            for label, x, y in zip(names, xs, ys):
                ax.annotate(label,                       # text to draw
                            (x, y),                      # point being labelled
                            textcoords="offset points",  # xytext is an offset from the point
                            xytext=(0,10),               # 10 points above the marker
                            ha='center')                 # horizontally centred
        plt.show()
        fig.savefig(f'plot/gen-prof/scatter/ft_deb_{direction}' + ('_mean' if with_mean else ''))
# Gender directions for Gonen's embeddings (E_g).

# g_diff: direction returned by E_g.diff for the pronoun pair 'lui'/'lei'.
g_diff = E_g.diff('lui', 'lei')

# Definitional pairs used for the PCA, listed as (male, female).
# An alternative listing with each pair reversed (female, male) was
# previously kept here as a disabled string literal.
gender_pairs = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]

# PCA over the definitional pairs
pca = we.doPCA(gender_pairs, E_g)
pc = pca.components_        # principal components
sv = pca.singular_values_   # singular values

print("Singular values:")
print(sv)
plt.bar(range(pca.n_components_), sv)
plt.title("Singular values")
plt.show()

# g_pca_0: first principal component, rescaled to unit norm.
g_pca_0 = pc[0] / LA.norm(pc[0])

# g_pca_01: combination of the first two components weighted by their
# singular values, rescaled to unit norm.
_weighted_pc = (sv[0] * pc[0] + sv[1] * pc[1]) / (sv[0] + sv[1])
g_pca_01 = _weighted_pc / LA.norm(_weighted_pc)
# Gender direction g_e: dominant eigenvector of inv(Delta_G) . Delta_S, where
# Delta_S / Delta_G are built from the difference vectors of the semantic
# pairs (S_word) and of the pairs read from 'gram_def_mf.txt' (G_word).

# Semantic pairs, listed as (male, female).
S_word = [
    ['lui', 'lei'],
    ['uomo', 'donna'],
    ['padre', 'madre'],
    ['marito', 'moglie'],
    ['fratello', 'sorella'],
    ['maschio', 'femmina'],
]
# Pairs loaded with the 'mf' reader mode; element 0 feeds the "m" matrix and
# element 1 the "f" matrix below.
G_word = db.read('gram_def_mf.txt', 'mf')

# Embedding vectors of each side of every pair.
# The male word is element 0 of each S_word pair, so it goes into S_m_v (the
# previous code read element 1 into S_m_v, i.e. the female word). Delta_S is
# unaffected by the swap because (a-b)(a-b)^T == (b-a)(b-a)^T.
S_m_v = [E_g.v(pair[0]) for pair in S_word]
S_f_v = [E_g.v(pair[1]) for pair in S_word]
G_m_v = [E_g.v(pair[0]) for pair in G_word]
G_f_v = [E_g.v(pair[1]) for pair in G_word]

# One column per word.
S_m = np.array(S_m_v).T
S_f = np.array(S_f_v).T
G_m = np.array(G_m_v).T
G_f = np.array(G_f_v).T

# delta semantic
S_diff = np.subtract(S_m, S_f)
Delta_S = np.dot(S_diff, S_diff.T)
# delta grammatical
G_diff = np.subtract(G_m, G_f)
Delta_G = np.dot(G_diff, G_diff.T)

# NOTE(review): Delta_G is a sum of one outer product per G_word pair; if
# G_word has fewer pairs than the embedding dimension the matrix is
# rank-deficient and this inverse is numerically unreliable -- confirm.
A = np.dot(np.linalg.inv(Delta_G), Delta_S)

# eigenvalues and eigenvectors
# A is not symmetric in general, so w and v may be complex; downstream code
# takes np.real of the projections on g_e.
w, v = np.linalg.eig(A)

# gender direction: eigenvector associated to the largest eigenvalue
# (np.linalg.eig returns unit-norm eigenvectors, so no rescaling is needed)
g_e = v[:, np.argmax(w)]
# Ungendered professions: project each one on the four gender directions and
# collect the results, together with the reference statistics, in a table.

# read professions
prof_ung_truth = db.read('professions_ung.csv', 'truth')

# projections of every profession on each direction
# (all four calls receive the same profession list)
proj_truth_g_diff = db.prof_proj(E_g, prof_ung_truth, g_diff, 'istat')
proj_truth_g_pca_0 = db.prof_proj(E_g, prof_ung_truth, g_pca_0, 'istat')
proj_truth_g_pca_01 = db.prof_proj(E_g, prof_ung_truth, g_pca_01, 'istat')
proj_truth_g_e = db.prof_proj(E_g, prof_ung_truth, g_e, 'istat')

# pandas columns
truth_col0 = [row[0] for row in proj_truth_g_diff]            # profession
truth_col1 = [row[1] for row in proj_truth_g_diff]            # projection on g_diff
truth_col2 = [-row[1] for row in proj_truth_g_pca_0]          # projection on g_pca_0 (sign flipped)
truth_col3 = [-row[1] for row in proj_truth_g_pca_01]         # projection on g_pca_01 (sign flipped)
truth_col4 = [-np.real(row[1]) for row in proj_truth_g_e]     # projection on g_e (real part, sign flipped)
truth_col5 = [row[2] for row in proj_truth_g_diff]            # %male
truth_col6 = [row[3] for row in proj_truth_g_diff]            # %female

# print results
truth_data = {
    'Profession': truth_col0,
    'Proj g_diff': truth_col1,
    'Proj g_pca_0': truth_col2,
    'Proj g_pca_01': truth_col3,
    'Prog g_e': truth_col4,
    '% male': truth_col5,
    '% female': truth_col6,
}
truth_table = pd.DataFrame(truth_data)
truth_table
truth_table.to_csv('truth_ung_g.csv', index=False)
# Pearson correlation between the female share (truth_col6) and the
# projections of the ungendered professions on each gender direction.
pearson_data = {
    direction: scipy.stats.pearsonr(truth_col6, projections)[0]
    for direction, projections in (
        ('g_diff', truth_col1),
        ('g_pca_0', truth_col2),
        ('g_pca_01', truth_col3),
        ('g_e', truth_col4),
    )
}
# Individual coefficients, kept available under their own names.
r_g_diff = pearson_data['g_diff']
r_g_pca_0 = pearson_data['g_pca_0']
r_g_pca_01 = pearson_data['g_pca_01']
r_g_e = pearson_data['g_e']

# single-row table, row label 'r'
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
# Correlation plots for the ungendered professions: one figure per gender
# direction, showing the data points, the fitted regression line and a label
# for every profession. Each figure is shown and then saved to disk.
correlation_specs = [
    # (projections, marker colour, title, output path)
    (truth_col1, 'blue', "g_diff", 'plot/ung-prof/correlation/g_g_diff'),
    (truth_col2, 'red', "g_pca_0", 'plot/ung-prof/correlation/g_g_pca_0'),
    (truth_col3, 'green', "g_pca_01", 'plot/ung-prof/correlation/g_g_pca_01'),
    (truth_col4, 'magenta', "g_e", 'plot/ung-prof/correlation/g_g_e'),
]

for proj_values, marker_colour, direction_title, out_path in correlation_specs:
    # linear fit of the projection against the female share
    slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, proj_values)
    line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    ax.plot(truth_col6, proj_values, linewidth=0, marker='s',
            color=marker_colour, label='Data points')
    ax.plot(np.array(truth_col6), intercept + slope * np.array(truth_col6), label=line)
    ax.set_xlabel('% female')
    ax.set_ylabel('proj value')
    ax.set_title(direction_title)
    ax.legend(facecolor='white')

    # write the profession name just above each point
    for profession, x, y in zip(truth_col0, truth_col6, proj_values):
        ax.annotate(profession,                  # text of the label
                    (x, y),                      # point being labelled
                    textcoords="offset points",  # xytext is an offset from the point
                    xytext=(0, 10),              # 10 points above the marker
                    ha='center')                 # horizontally centred on the point

    plt.show()
    fig.savefig(out_path)
# scatter plots
# 2x2 grid with one scatter plot per gender direction (ungendered professions,
# projection value against the female share), saved as a single image.
#
# The figure must be bound to `fig`: previously the result of plt.figure()
# was discarded, so the final fig.savefig() wrote whatever figure `fig` still
# pointed to from an earlier cell instead of this grid.
fig = plt.figure(figsize=(18, 18))

grid_panels = [
    # (subplot position, projections, colour, title)
    (221, truth_col1, 'blue', 'g_diff'),
    (222, truth_col2, 'red', 'g_pca_0'),
    (223, truth_col3, 'green', 'g_pca_01'),
    (224, truth_col4, 'magenta', 'g_e'),
]

grid_axes = []
for position, proj_values, colour, direction_title in grid_panels:
    panel = fig.add_subplot(position)
    panel.scatter(truth_col6, proj_values, c=colour)
    panel.set_title(direction_title)
    panel.set_xlabel('% female')
    panel.set_ylabel('proj value')
    # write the profession name just above each point of this panel
    for profession, x, y in zip(truth_col0, truth_col6, proj_values):
        panel.annotate(profession,                  # text of the label
                       (x, y),                      # point being labelled
                       textcoords="offset points",  # xytext is an offset from the point
                       xytext=(0, 10),              # 10 points above the marker
                       ha='center')                 # horizontally centred on the point
    grid_axes.append(panel)

# keep the per-panel names the original cell exposed
ax1, ax2, ax3, ax4 = grid_axes

plt.show()
fig.savefig('plot/ung-prof/scatter/g_all')
# Pairwise comparison of the gender directions on the ungendered professions:
# for every unordered pair of directions, overlay the two sets of projections
# on one figure, label every point, show the figure and save it.
pairwise_series = [
    # (direction name, projections, colour)
    ('g_diff', truth_col1, 'blue'),
    ('g_pca_0', truth_col2, 'red'),
    ('g_pca_01', truth_col3, 'green'),
    ('g_e', truth_col4, 'magenta'),
]

# Pairs are visited in list order: g_diff vs the three others, then g_pca_0
# vs the remaining two, then g_pca_01 vs g_e.
for pos, (name_a, proj_a, colour_a) in enumerate(pairwise_series):
    for name_b, proj_b, colour_b in pairwise_series[pos + 1:]:
        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111)
        ax.scatter(truth_col6, proj_a, c=colour_a, label=name_a)
        ax.scatter(truth_col6, proj_b, c=colour_b, label=name_b)
        ax.set_title(f'{name_a} vs {name_b}')
        ax.set_xlabel('% female')
        ax.set_ylabel('proj value')
        ax.legend()

        # each profession is labelled twice, once per series
        for profession, share, y_a, y_b in zip(truth_col0, truth_col6, proj_a, proj_b):
            for y in (y_a, y_b):
                ax.annotate(profession,                  # text of the label
                            (share, y),                  # point being labelled
                            textcoords="offset points",  # xytext is an offset from the point
                            xytext=(0, 10),              # 10 points above the marker
                            ha='center')                 # horizontally centred on the point

        plt.show()
        fig.savefig(f'plot/ung-prof/scatter/g_{name_a}_vs_{name_b}')
# Gendered professions (male form / female form): project both forms on the
# four gender directions, collect the results in a table and save it.

# read professions
prof_gen_truth = db.read('professions_gen.csv', 'truth-mf')

# projections of every profession pair on each direction
# (all four calls receive the same profession list)
mf_proj_truth_g_diff = db.prof_proj(E_g, prof_gen_truth, g_diff, 'istat-mf')
mf_proj_truth_g_pca_0 = db.prof_proj(E_g, prof_gen_truth, g_pca_0, 'istat-mf')
mf_proj_truth_g_pca_01 = db.prof_proj(E_g, prof_gen_truth, g_pca_01, 'istat-mf')
mf_proj_truth_g_e = db.prof_proj(E_g, prof_gen_truth, g_e, 'istat-mf')

# pandas columns
# NOTE(review): the male projections on g_pca_0, g_pca_01 and g_e are
# sign-flipped while the female projections on the same directions are not.
# This asymmetry is kept exactly as it was, but it also feeds the male/female
# means computed later -- confirm it is intended.
mf_truth_col0 = [row[0] for row in mf_proj_truth_g_diff]           # male profession
mf_truth_col1 = [row[1] for row in mf_proj_truth_g_diff]           # male projection on g_diff
mf_truth_col2 = [-row[1] for row in mf_proj_truth_g_pca_0]         # male projection on g_pca_0
mf_truth_col3 = [-row[1] for row in mf_proj_truth_g_pca_01]        # male projection on g_pca_01
mf_truth_col4 = [-np.real(row[1]) for row in mf_proj_truth_g_e]    # male projection on g_e
mf_truth_col5 = [row[2] for row in mf_proj_truth_g_diff]           # female profession
mf_truth_col6 = [row[3] for row in mf_proj_truth_g_diff]           # female projection on g_diff
mf_truth_col7 = [row[3] for row in mf_proj_truth_g_pca_0]          # female projection on g_pca_0
mf_truth_col8 = [row[3] for row in mf_proj_truth_g_pca_01]         # female projection on g_pca_01
mf_truth_col9 = [np.real(row[3]) for row in mf_proj_truth_g_e]     # female projection on g_e
mf_truth_col10 = [row[4] for row in mf_proj_truth_g_diff]          # %female

# print results
mf_truth_data = {
    'Male profession': mf_truth_col0,
    'M-Proj g_diff': mf_truth_col1,
    'M-Proj g_pca_0': mf_truth_col2,
    'M-Proj g_pca_01': mf_truth_col3,
    'M-Prog g_e': mf_truth_col4,
    'Female profession': mf_truth_col5,
    'F-Proj g_diff': mf_truth_col6,
    'F-Proj g_pca_0': mf_truth_col7,
    'F-Proj g_pca_01': mf_truth_col8,
    'F-Prog g_e': mf_truth_col9,
    '% female': mf_truth_col10,
}
mf_truth_table = pd.DataFrame(mf_truth_data)
mf_truth_table
# Save the gendered table; the previous code wrote `truth_table` (the
# ungendered table) into this file by mistake.
mf_truth_table.to_csv('truth_gen_g.csv', index=False)
# Mean of the male and female projections of each gendered profession, per
# gender direction, and its Pearson correlation with the female share.

# compute mean male-female projection
mf_mean_g_diff = np.add(np.array(mf_truth_col1), np.array(mf_truth_col6)) / 2
mf_mean_g_pca_0 = np.add(np.array(mf_truth_col2), np.array(mf_truth_col7)) / 2
mf_mean_g_pca_01 = np.add(np.array(mf_truth_col3), np.array(mf_truth_col8)) / 2
mf_mean_g_e = np.add(np.array(mf_truth_col4), np.array(mf_truth_col9)) / 2

# pearson correlation
pearson_data = {
    direction: scipy.stats.pearsonr(mf_truth_col10, mean_projection)[0]
    for direction, mean_projection in (
        ('g_diff', mf_mean_g_diff),
        ('g_pca_0', mf_mean_g_pca_0),
        ('g_pca_01', mf_mean_g_pca_01),
        ('g_e', mf_mean_g_e),
    )
}
# Individual coefficients, kept available under their own names.
r_mean_g_diff = pearson_data['g_diff']
r_mean_g_pca_0 = pearson_data['g_pca_0']
r_mean_g_pca_01 = pearson_data['g_pca_01']
r_mean_g_e = pearson_data['g_e']

# single-row table, row label 'r'
pearson_table = pd.DataFrame(pearson_data, index=['r'])
pearson_table
# scatter plots
# For every gender direction two figures are produced:
#   1. ungendered projections + their regression line, with the male and
#      female projections of the gendered professions overlaid;
#   2. the same, plus the mean male/female projection and its regression line.
# Each figure is shown and then saved under plot/gen-prof/scatter/.
gen_scatter_specs = [
    # (title, ung colour, ung proj, male proj, female proj, mean proj,
    #  ungendered labels drawn before the gendered ones?)
    ('g_diff', 'blue', truth_col1, mf_truth_col1, mf_truth_col6, mf_mean_g_diff, True),
    ('g_pca_0', 'red', truth_col2, mf_truth_col2, mf_truth_col7, mf_mean_g_pca_0, False),
    ('g_pca_01', 'green', truth_col3, mf_truth_col3, mf_truth_col8, mf_mean_g_pca_01, False),
    ('g_e', 'magenta', truth_col4, mf_truth_col4, mf_truth_col9, mf_mean_g_e, False),
]

for (direction_title, ung_colour, ung_proj, male_proj, female_proj,
     mean_proj, ung_labels_first) in gen_scatter_specs:
    for with_mean in (False, True):
        # regression of the ungendered projections on the female share
        slope, intercept, r, p, stderr = scipy.stats.linregress(truth_col6, ung_proj)
        if with_mean:
            line = f'Ung regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'
            # regression of the mean male/female projections on the female share
            slope_mean, intercept_mean, r_mean, p_mean, stderr_mean = scipy.stats.linregress(
                mf_truth_col10, mean_proj)
            line_mean = f'Gen regression line: y={intercept_mean:.2f}+{slope_mean:.2f}x, r={r_mean:.2f}'
        else:
            line = f'Regression line: y={intercept:.2f}+{slope:.2f}x, r={r:.2f}'

        fig = plt.figure(figsize=(10, 10))
        ax = fig.add_subplot(111)
        ax.plot(truth_col6, ung_proj, linewidth=0, marker='s',
                color=ung_colour, label='ung proj')
        ax.plot(np.array(truth_col6), intercept + slope * np.array(truth_col6), label=line)
        if with_mean:
            ax.plot(np.array(mf_truth_col10),
                    intercept_mean + slope_mean * np.array(mf_truth_col10),
                    label=line_mean, color='black')
        ax.scatter(mf_truth_col10, male_proj, c='cyan', label='male proj')
        ax.scatter(mf_truth_col10, female_proj, c='pink', label='female proj')
        if with_mean:
            ax.scatter(mf_truth_col10, mean_proj, c='brown', label='mean proj')
        ax.set_title(direction_title)
        ax.set_xlabel('% female')
        ax.set_ylabel('proj value')
        ax.legend()

        # Point labels: male forms, female forms and ungendered professions.
        # The position of the ungendered group in the drawing order follows
        # the original per-figure order.
        label_groups = [
            (mf_truth_col0, mf_truth_col10, male_proj),
            (mf_truth_col5, mf_truth_col10, female_proj),
        ]
        ung_group = (truth_col0, truth_col6, ung_proj)
        if ung_labels_first:
            label_groups.insert(0, ung_group)
        else:
            label_groups.append(ung_group)

        for names, xs, ys in label_groups:
            for name, x, y in zip(names, xs, ys):
                ax.annotate(name,                        # text of the label
                            (x, y),                      # point being labelled
                            textcoords="offset points",  # xytext is an offset from the point
                            xytext=(0, 10),              # 10 points above the marker
                            ha='center')                 # horizontally centred on the point

        plt.show()
        suffix = '_mean' if with_mean else ''
        fig.savefig(f'plot/gen-prof/scatter/g_{direction_title}{suffix}')